As discussed in Chapter 6, logistic regression estimates a linear relationship between a set of features and a binary outcome, mediated by a sigmoid function to ensure the model produces probabilities. The frequentist approach yields point estimates for the parameters that measure the influence of each feature on the probability that a data point belongs to the positive class, with confidence intervals that rely on assumptions about the sampling distribution of the estimates.
In contrast, Bayesian logistic regression estimates the posterior distribution of the parameters themselves. The posterior supports more robust Bayesian credible intervals for each parameter, with the benefit of greater transparency about the model's uncertainty.
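To make the logistic link concrete, here is a minimal sketch in plain NumPy (the coefficient values are made up for illustration) showing how the sigmoid turns a linear combination of features into a probability:
import numpy as np

def sigmoid(z):
    # inverse-logit link: maps any real number into (0, 1)
    return 1 / (1 + np.exp(-z))

# hypothetical coefficients and a single standardized feature vector
intercept, betas = -4.0, np.array([-1.5, 1.5])
x = np.array([0.8, -0.2])
p_positive = sigmoid(intercept + x @ betas)  # P(y=1 | x) under the logistic model
print(f'{p_positive:.4f}')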
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
from scipy import stats
import pandas_datareader.data as web
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_auc_score
import theano
import pymc3 as pm
import arviz
from pymc3.variational.callbacks import CheckParametersConvergence
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from matplotlib import animation
import seaborn as sns
from IPython.display import HTML
sns.set_style('whitegrid')
data_path = Path('data')
fig_path = Path('figures')
model_path = Path('models')
for p in [data_path, fig_path, model_path]:
    if not p.exists():
        p.mkdir()
We will use a small and simple dataset so we can focus on the workflow. We use the Federal Reserve Economic Data (FRED) service (see Chapter 2) to download the US recession dates as defined by the National Bureau of Economic Research. We also source four variables that are commonly used to predict the onset of a recession (Kelley 2019) and are available via FRED, namely the yield curve, financial conditions, leverage, and sentiment:
indicators = ['JHDUSRGDPBR', 'T10Y3M', 'NFCI', 'NFCINONFINLEVERAGE', 'UMCSENT']
var_names = ['recession', 'yield_curve', 'financial_conditions', 'leverage', 'sentiment']
features = var_names[1:]
label = var_names[0]
var_display = ['Recession', 'Yield Curve', 'Financial Conditions', 'Leverage', 'Sentiment']
col_dict = dict(zip(var_names, var_display))
data = (web.DataReader(indicators, 'fred', 1980, 2020)
        .ffill()
        .resample('M')
        .last()
        .dropna())
data.columns = var_names
We standardize the features so that each has mean 0 and standard deviation 1:
data.loc[:, features] = scale(data.loc[:, features])
data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 457 entries, 1982-01-31 to 2020-01-31
Freq: M
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   recession             457 non-null    float64
 1   yield_curve           457 non-null    float64
 2   financial_conditions  457 non-null    float64
 3   leverage              457 non-null    float64
 4   sentiment             457 non-null    float64
dtypes: float64(5)
memory usage: 21.4 KB
mi = []
months = list(range(1, 25))
for month in months:
    df_ = data.copy()
    df_[label] = df_[label].shift(-month)
    df_ = df_.dropna()
    mi.append(mutual_info_classif(df_.loc[:, features], df_[label]))
mi = pd.DataFrame(mi, columns=features, index=months)
mi.sum(1).mul(100).iloc[:12].sort_index(ascending=False).plot.barh(figsize=(12, 4), xlim=(15, 40));
fig, ax = plt.subplots(figsize=(20, 3))
sns.heatmap(mi.rename(columns=col_dict).T*100, cmap='Greys', ax=ax, annot=True, fmt='.1f', cbar=False)
ax.set_xlabel('Lead Time (Months)')
ax.set_title('Mutual Information between Indicators and Recession by Lead Time')
fig.tight_layout();
data[label] = data[label].shift(-12)
data = data.dropna()
data_ = pd.melt(data.rename(columns=col_dict), id_vars='Recession').rename(columns=str.capitalize)
g = sns.catplot(x='Recession', y='Value', col='Variable', data=data_, kind='box');
X = data.loc[:, features]
y = data[label]
y.value_counts()
0.0    396
1.0     49
Name: recession, dtype: int64
data.to_csv('data/recessions.csv')
data = pd.read_csv('data/recessions.csv', index_col=0)
data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 445 entries, 1982-01-31 to 2019-01-31
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   recession             445 non-null    float64
 1   yield_curve           445 non-null    float64
 2   financial_conditions  445 non-null    float64
 3   leverage              445 non-null    float64
 4   sentiment             445 non-null    float64
dtypes: float64(5)
memory usage: 20.9+ KB
simple_model = 'recession ~ yield_curve + leverage'
full_model = simple_model + ' + financial_conditions + sentiment'
A probabilistic program consists of observed and unobserved random variables (RVs). As discussed, we define the observed RVs via likelihood distributions and unobserved RVs via prior distributions. PyMC3 includes numerous probability distributions for this purpose.
The PyMC3 library makes it very straightforward to perform approximate Bayesian inference for logistic regression. Logistic regression models the probability that observation i belongs to the positive class (here: a recession occurring twelve months later) based on k features, as outlined in the figure below that uses plate notation:
We will use the with context manager to define a manual_logistic_model that we can refer to later as a probabilistic model:
with pm.Model() as manual_logistic_model:
    # random variables for coefficients with
    # uninformative priors for each parameter
    intercept = pm.Normal('intercept', 0, sd=100)
    beta_1 = pm.Normal('beta_1', 0, sd=100)
    beta_2 = pm.Normal('beta_2', 0, sd=100)

    # Transform random variables into vector of probabilities p(y_i=1)
    # according to logistic regression model specification.
    likelihood = pm.invlogit(intercept +
                             beta_1 * data.yield_curve +
                             beta_2 * data.leverage)

    # Bernoulli random vector with probability of success
    # given by sigmoid function and actual data as observed
    pm.Bernoulli(name='logit',
                 p=likelihood,
                 observed=data.recession)
manual_logistic_model.model
The command pm.model_to_graphviz(manual_logistic_model) produces the plate notation displayed below.
It shows the unobserved parameters as light and the observed elements as dark circles. The rectangle indicates the number of repetitions of the observed model element implied by the data included in the model definition.
pm.model_to_graphviz(manual_logistic_model)
# optionally: persist
# graph = pm.model_to_graphviz(manual_logistic_model)
# graph.save('log_reg.dot')
with manual_logistic_model:
    # compute maximum a-posteriori estimate
    # for logistic regression weights
    manual_map_estimate = pm.find_MAP()
def print_map(result):
    # np.asscalar is deprecated in recent NumPy versions; .item() is equivalent
    return pd.Series({k: np.asarray(v).item() for k, v in result.items()})
print_map(manual_map_estimate)
intercept   -3.891662
beta_1      -1.504658
beta_2       1.524843
dtype: float64
PyMC3 includes numerous common models, so we can usually reserve the manual specification for custom applications.
The following code defines the same logistic regression as a member of the Generalized Linear Models (GLM) family, using the formula format inspired by the statistical language R and ported to Python by the patsy library:
with pm.Model() as logistic_model:
    pm.glm.GLM.from_formula(simple_model,
                            data,
                            family=pm.glm.families.Binomial())
pm.model_to_graphviz(logistic_model)
We obtain point MAP estimates for the three parameters by calling pm.find_MAP() inside the context of the model we just defined:
with logistic_model:
    map_estimate = pm.find_MAP()
PyMC3 solves the optimization problem of finding the posterior point with the highest density using the quasi-Newton Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm, but offers several alternatives provided by the scipy library. The result is virtually identical to the corresponding statsmodels estimate:
model = smf.logit(formula=simple_model, data=data)
result = model.fit()
print(result.summary())
Optimization terminated successfully.
Current function value: 0.174373
Iterations 8
Logit Regression Results
==============================================================================
Dep. Variable: recession No. Observations: 445
Model: Logit Df Residuals: 442
Method: MLE Df Model: 2
Date: Tue, 01 Jun 2021 Pseudo R-squ.: 0.4971
Time: 11:18:26 Log-Likelihood: -77.596
converged: True LL-Null: -154.30
Covariance Type: nonrobust LLR p-value: 4.855e-34
===============================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------
Intercept -3.8917 0.401 -9.717 0.000 -4.677 -3.107
yield_curve -1.5047 0.277 -5.435 0.000 -2.047 -0.962
leverage 1.5249 0.249 6.123 0.000 1.037 2.013
===============================================================================
print_map(map_estimate)
Intercept     -3.891744
yield_curve   -1.504697
leverage       1.524875
dtype: float64
result.params
Intercept     -3.891746
yield_curve   -1.504698
leverage       1.524875
dtype: float64
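pm.find_MAP() also accepts alternative optimizers from scipy.optimize via its method argument. The following sketch uses the gradient-free Powell method purely for illustration; the default shown above is usually sufficient:
# sketch: compute the MAP estimate with a different scipy optimizer
with logistic_model:
    map_powell = pm.find_MAP(method='Powell')
print_map(map_powell)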
def plot_traces(traces, burnin=2000):
    summary = arviz.summary(traces[burnin:])['mean'].to_dict()
    ax = arviz.plot_trace(traces[burnin:],
                          figsize=(15, len(traces.varnames)*1.5),
                          lines=summary)
    for i, mn in enumerate(summary.values()):
        ax[i, 0].annotate(f'{mn:.2f}', xy=(mn, 0),
                          xycoords='data', xytext=(5, 10),
                          textcoords='offset points',
                          rotation=90, va='bottom',
                          fontsize='large',
                          color='#AA0022')
We will use the full model to illustrate Markov Chain Monte Carlo inference:
with pm.Model() as logistic_model:
    pm.glm.GLM.from_formula(formula=full_model,
                            data=data,
                            family=pm.glm.families.Binomial())
logistic_model.basic_RVs
[Intercept ~ Flat, yield_curve ~ Normal, leverage ~ Normal, financial_conditions ~ Normal, sentiment ~ Normal, y ~ Binomial]
We will use the Metropolis-Hastings algorithm to sample from the posterior distribution. You can explore its hyperparameters, such as the scale of the proposal distribution, to speed up convergence, and use the plot_traces function to inspect convergence visually. You may also initialize the sampler at the MAP estimate, which shortens the warmup (burn-in) period because sampling starts from a high-probability point (see the sketch below).
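As a minimal sketch (the proposal scaling value here is an arbitrary choice for illustration, not a tuned setting), MAP initialization and an explicit proposal scale look as follows:
with logistic_model:
    start = pm.find_MAP()  # high-probability starting point shortens burn-in
    trace_demo = pm.sample(draws=1000,
                           tune=500,
                           step=pm.Metropolis(scaling=0.5),  # proposal scale; 0.5 is an assumption
                           start=start,
                           cores=1)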
with logistic_model:
    trace_mh = pm.sample(tune=1000,
                         draws=5000,
                         step=pm.Metropolis(),
                         cores=4)
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Metropolis: [sentiment]
>Metropolis: [financial_conditions]
>Metropolis: [leverage]
>Metropolis: [yield_curve]
>Metropolis: [Intercept]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 4 seconds.
The estimated number of effective samples is smaller than 200 for some parameters.
plot_traces(trace_mh, burnin=0)
pm.trace_to_dataframe(trace_mh).info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Intercept             20000 non-null  float64
 1   yield_curve           20000 non-null  float64
 2   leverage              20000 non-null  float64
 3   financial_conditions  20000 non-null  float64
 4   sentiment             20000 non-null  float64
dtypes: float64(5)
memory usage: 781.4 KB
with logistic_model:
    trace_mh = pm.sample(draws=100000,
                         step=pm.Metropolis(),
                         trace=trace_mh)
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Metropolis: [sentiment]
>Metropolis: [financial_conditions]
>Metropolis: [leverage]
>Metropolis: [yield_curve]
>Metropolis: [Intercept]
Sampling 4 chains for 1_000 tune and 105_000 draw iterations (4_000 + 420_000 draws total) took 72 seconds.
The number of effective samples is smaller than 10% for some parameters.
plot_traces(trace_mh, burnin=0)
# optionally: persist
# with open('logistic_model_mh.pkl', 'wb') as buff:
# pickle.dump({'model': logistic_model, 'trace': trace_mh}, buff)
# optionally: restore persisted model
# with open('logistic_model_mh.pkl', 'rb') as buff:
# pickled_data = pickle.load(buff)
# logistic_model, trace_mh = pickled_data['model'], pickled_data['trace']
arviz.summary(trace_mh)
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| Intercept | -6.244 | 0.834 | -7.810 | -4.702 | 0.015 | 0.011 | 3203.0 | 6035.0 | 1.0 |
| yield_curve | -1.516 | 0.348 | -2.177 | -0.871 | 0.003 | 0.002 | 12295.0 | 23175.0 | 1.0 |
| leverage | 2.773 | 0.475 | 1.893 | 3.670 | 0.008 | 0.006 | 3798.0 | 8199.0 | 1.0 |
| financial_conditions | 1.575 | 0.345 | 0.937 | 2.232 | 0.005 | 0.004 | 4131.0 | 7581.0 | 1.0 |
| sentiment | 1.708 | 0.440 | 0.878 | 2.529 | 0.007 | 0.005 | 4193.0 | 8185.0 | 1.0 |
Using pm.sample without specifying a sampling method defaults to the No-U-Turn Sampler (NUTS), a form of Hamiltonian Monte Carlo that automatically tunes its parameters. It usually converges faster and yields less correlated samples than vanilla Metropolis-Hastings.
Note that variables measured on very different scales can slow down the sampling process; this is why we standardized the features with sklearn's scale() function earlier.
Once we have defined our model as above with the new formula, we are ready to perform inference to approximate the posterior distribution. MCMC sampling algorithms are available through the pm.sample() function.
By default, PyMC3 automatically selects the most efficient sampler and initializes the sampling process for efficient convergence. For a continuous model, PyMC3 chooses the NUTS sampler discussed in the previous section. It also runs variational inference via ADVI to find good starting parameters for the sampler. One among several alternatives is to use the MAP estimate.
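As a hedged sketch, the initialization can also be requested explicitly via the init argument, for example combining ADVI with a diagonal mass-matrix adaptation (the exact set of supported init strings depends on the PyMC3 version):
# sketch: request ADVI-based initialization for NUTS explicitly
with logistic_model:
    trace_init_demo = pm.sample(draws=100,
                                tune=500,
                                init='advi+adapt_diag',
                                chains=2,
                                cores=1,
                                random_seed=42)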
To see what convergence looks like, we first draw only 100 samples after tuning the sampler for 1,000 iterations, which will be discarded. The sampling process can be parallelized across multiple chains using the cores argument (except when using a GPU).
draws = 100
tune = 1000
with logistic_model:
    trace_NUTS = pm.sample(draws=draws,
                           tune=tune,
                           init='adapt_diag',
                           chains=4,
                           cores=1,
                           random_seed=42)
Only 100 samples in chain.
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Sequential sampling (4 chains in 1 job)
NUTS: [sentiment, financial_conditions, leverage, yield_curve, Intercept]
Sampling 4 chains for 1_000 tune and 100 draw iterations (4_000 + 400 draws total) took 6 seconds.
trace_df = pm.trace_to_dataframe(trace_NUTS).assign(
    chain=lambda x: x.index // draws)
trace_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Intercept             400 non-null    float64
 1   yield_curve           400 non-null    float64
 2   leverage              400 non-null    float64
 3   financial_conditions  400 non-null    float64
 4   sentiment             400 non-null    float64
 5   chain                 400 non-null    int64
dtypes: float64(5), int64(1)
memory usage: 18.9 KB
plot_traces(trace_NUTS, burnin=0)
The resulting trace contains the sampled values for each random variable. We can continue sampling by providing the trace of a prior run as input:
draws = 50000
chains = 4
with logistic_model:
    trace_NUTS = pm.sample(draws=draws,
                           tune=tune,
                           init='adapt_diag',
                           trace=trace_NUTS,
                           chains=chains,
                           cores=1,
                           random_seed=42)
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Sequential sampling (4 chains in 1 job)
NUTS: [sentiment, financial_conditions, leverage, yield_curve, Intercept]
Sampling 4 chains for 1_000 tune and 50_100 draw iterations (4_000 + 200_400 draws total) took 259 seconds.
plot_traces(trace_NUTS, burnin=1000)
# optional
# with open('logistic_model_nuts.pkl', 'wb') as buff:
# pickle.dump({'model': logistic_model,
# 'trace': trace_NUTS}, buff)
# with open('logistic_model_nuts.pkl', 'rb') as buff:
# pickled_data = pickle.load(buff)
# logistic_model, trace_NUTS = pickled_data['model'], pickled_data['trace']
df = pm.trace_to_dataframe(trace_NUTS).iloc[200:].reset_index(
    drop=True).assign(chain=lambda x: x.index // draws)
trace_df = pd.concat([trace_df.assign(samples=100),
                      df.assign(samples=len(df) + len(trace_df))])
trace_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200600 entries, 0 to 200199
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Intercept             200600 non-null  float64
 1   yield_curve           200600 non-null  float64
 2   leverage              200600 non-null  float64
 3   financial_conditions  200600 non-null  float64
 4   sentiment             200600 non-null  float64
 5   chain                 200600 non-null  int64
 6   samples               200600 non-null  int64
dtypes: float64(5), int64(2)
memory usage: 12.2 MB
trace_df_long = pd.melt(trace_df, id_vars=['samples', 'chain'])
trace_df_long.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003000 entries, 0 to 1002999
Data columns (total 4 columns):
 #   Column    Non-Null Count    Dtype
---  ------    --------------    -----
 0   samples   1003000 non-null  int64
 1   chain     1003000 non-null  int64
 2   variable  1003000 non-null  object
 3   value     1003000 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 30.6+ MB
g = sns.FacetGrid(trace_df_long, col='variable', row='samples',
                  hue='chain', sharex='col', sharey=False)
g = g.map(sns.distplot, 'value', hist=False, rug=False);
model = smf.logit(formula=full_model, data=data)
result = model.fit()
print(result.summary())
Optimization terminated successfully.
Current function value: 0.143054
Iterations 10
Logit Regression Results
==============================================================================
Dep. Variable: recession No. Observations: 445
Model: Logit Df Residuals: 440
Method: MLE Df Model: 4
Date: Tue, 01 Jun 2021 Pseudo R-squ.: 0.5874
Time: 11:27:15 Log-Likelihood: -63.659
converged: True LL-Null: -154.30
Covariance Type: nonrobust LLR p-value: 3.941e-38
========================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------
Intercept -5.9230 0.794 -7.464 0.000 -7.478 -4.368
yield_curve -1.4528 0.333 -4.359 0.000 -2.106 -0.800
leverage 2.6125 0.452 5.780 0.000 1.727 3.498
financial_conditions 1.5035 0.332 4.533 0.000 0.853 2.153
sentiment 1.5955 0.423 3.772 0.000 0.766 2.425
========================================================================================
Possibly complete quasi-separation: A fraction 0.24 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
arviz.summary(trace_NUTS).assign(statsmodels=result.params).to_csv(model_path / 'trace_nuts.csv')
arviz.summary(trace_NUTS).assign(statsmodels=result.params)
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | statsmodels |
|---|---|---|---|---|---|---|---|---|---|---|
| Intercept | -6.232 | 0.845 | -7.826 | -4.686 | 0.004 | 0.003 | 58539.0 | 70467.0 | 1.0 | -5.922991 |
| yield_curve | -1.518 | 0.351 | -2.183 | -0.865 | 0.001 | 0.001 | 97079.0 | 98113.0 | 1.0 | -1.452784 |
| leverage | 2.763 | 0.481 | 1.890 | 3.683 | 0.002 | 0.001 | 65290.0 | 83779.0 | 1.0 | 2.612501 |
| financial_conditions | 1.570 | 0.350 | 0.915 | 2.227 | 0.001 | 0.001 | 65638.0 | 81995.0 | 1.0 | 1.503479 |
| sentiment | 1.700 | 0.447 | 0.868 | 2.540 | 0.002 | 0.001 | 64596.0 | 81854.0 | 1.0 | 1.595507 |
We can compute credible intervals, the Bayesian counterpart of confidence intervals, as percentiles of the trace. The resulting boundaries reflect our beliefs about the range of the parameter value at a given probability level, as opposed to the frequentist interpretation, under which the interval would contain the true parameter value in a given fraction of a large number of repeated trials.
def get_credible_int(trace, param):
    b = trace[param]
    lb, ub = np.percentile(b, 2.5), np.percentile(b, 97.5)
    # exponentiate to express the interval bounds as odds ratios
    lb, ub = np.exp(lb), np.exp(ub)
    return b, lb, ub
b = trace_NUTS['yield_curve']
lb, ub = np.percentile(b, 2.5), np.percentile(b, 97.5)
lb, ub = np.exp(lb), np.exp(ub)
print(f'P({lb:.3f} < Odds Ratio < {ub:.3f}) = 0.95')
P(0.106 < Odds Ratio < 0.422) = 0.95
b, lb, ub = get_credible_int(trace_NUTS, 'yield_curve')
print(f'P({lb:.3f} < Odds Ratio < {ub:.3f}) = 0.95')
P(0.106 < Odds Ratio < 0.422) = 0.95
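ArviZ can also compute a highest-density interval (HDI) directly; the following sketch passes a flattened sample array, which arviz.hdi accepts (it may warn about the missing chain dimension):
# sketch: 95% HDI for the yield-curve odds ratio via arviz
odds_ratio_samples = np.exp(trace_NUTS['yield_curve'])
hdi_low, hdi_high = arviz.hdi(odds_ratio_samples, hdi_prob=0.95)
print(f'95% HDI: [{hdi_low:.3f}, {hdi_high:.3f}]')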
fig, axes = plt.subplots(figsize=(14, 4), ncols=2)
b, lb, ub = get_credible_int(trace_NUTS, 'yield_curve')
sns.distplot(np.exp(b), axlabel='Odds Ratio', ax=axes[0])
axes[0].set_title(f'Yield Curve: P({lb:.3f} < Odds Ratio < {ub:.3f}) = 0.95')
axes[0].axvspan(lb, ub, alpha=0.5, color='gray')
b, lb, ub = get_credible_int(trace_NUTS, 'leverage')
sns.distplot(np.exp(b), axlabel='Odds Ratio', ax=axes[1])
axes[1].set_title(f'Leverage: P({lb:.3f} < Odds Ratio < {ub:.3f}) = 0.95')
axes[1].axvspan(lb, ub, alpha=0.5, color='gray')
fig.suptitle('Credible Intervals', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9);
The interface for variational inference is very similar to the MCMC implementation. We just use fit() instead of the sample() function, optionally including a CheckParametersConvergence callback that stops the optimization early once the approximate distribution has converged to within a given tolerance:
with logistic_model:
    callback = CheckParametersConvergence(diff='absolute')
    approx = pm.fit(n=100000,
                    callbacks=[callback])
Finished [100%]: Average Loss = 98.846
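To verify that the optimization converged, we can plot the negative ELBO that PyMC3 records on the fitted approximation's hist attribute (a minimal sketch):
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(approx.hist)  # loss history recorded during pm.fit
ax.set_yscale('log')
ax.set_xlabel('Iteration')
ax.set_ylabel('Negative ELBO (log scale)')
fig.tight_layout();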
# optional
# with open('logistic_model_advi.pkl', 'wb') as buff:
# pickle.dump({'model': logistic_model,
# 'approx': approx}, buff)
We can draw samples from the approximated distribution to obtain a trace object as above for the MCMC sampler:
trace_advi = approx.sample(10000)
arviz.summary(trace_advi)
arviz - WARNING - Shape validation failed: input_shape: (1, 10000), minimum_shape: (chains=2, draws=4)
| | mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
|---|---|---|---|---|---|---|---|---|---|
| Intercept | -6.099 | 0.255 | -6.585 | -5.630 | 0.003 | 0.002 | 9864.0 | 9020.0 | NaN |
| yield_curve | -1.484 | 0.206 | -1.870 | -1.098 | 0.002 | 0.001 | 9930.0 | 9385.0 | NaN |
| leverage | 2.701 | 0.203 | 2.315 | 3.078 | 0.002 | 0.001 | 9852.0 | 9719.0 | NaN |
| financial_conditions | 1.529 | 0.183 | 1.185 | 1.873 | 0.002 | 0.001 | 10168.0 | 9879.0 | NaN |
| sentiment | 1.651 | 0.212 | 1.242 | 2.041 | 0.002 | 0.002 | 9924.0 | 9427.0 | NaN |
arviz.summary(trace_advi).to_csv(model_path / 'trace_advi.csv')
arviz - WARNING - Shape validation failed: input_shape: (1, 10000), minimum_shape: (chains=2, draws=4)
Bayesian model diagnostics include validating that the sampling process has converged and consistently samples from high-probability regions of the posterior, and confirming that the model represents the data well.
For high-dimensional models with many variables, it becomes cumbersome to inspect numerous traces. When using NUTS, the energy plot helps to assess convergence problems. It summarizes how efficiently the random process explores the posterior: the plot shows the marginal energy distribution and the energy transition distribution, which should match well, as in the example below (see the references for conceptual detail).
arviz.plot_energy(trace_NUTS);
arviz.plot_forest(trace_NUTS);
fig, axes = plt.subplots(ncols=2, figsize=(14, 4))
arviz.plot_forest(trace_NUTS, ax=axes[0])
axes[0].set_title('Forest Plot')
arviz.plot_energy(trace_NUTS, ax=axes[1])
axes[1].set_title('Energy Plot')
fig.tight_layout();
Posterior predictive checks (PPCs) are very useful for examining how well a model fits the data. They do so by generating data from the model using parameters drawn from the posterior. We use the function pm.sample_posterior_predictive for this purpose and obtain n samples for each observation (the GLM module automatically names the outcome 'y'):
ppc = pm.sample_posterior_predictive(trace_NUTS, samples=500, model=logistic_model)
ppc['y'].shape
(500, 445)
y_score = np.mean(ppc['y'], axis=0)
roc_auc_score(y_score=y_score, y_true=data.recession)
0.961837765409194
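The per-observation mean of the posterior predictive samples can also serve as a probability estimate for hard classification; a minimal sketch using a 0.5 cutoff (the threshold is an assumption made here for illustration):
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = (ppc['y'].mean(axis=0) > 0.5).astype(int)
print('Accuracy:', accuracy_score(data.recession, y_pred))
print(confusion_matrix(data.recession, y_pred))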
Predictions use theano's shared variables to replace the training data with test data before running posterior predictive checks. To facilitate visualization, we build a dataset with the single predictor yield_curve, create the train and test sets, and convert the former to a shared variable. Note that we need to use numpy arrays and provide a list of column labels:
X = data[['yield_curve']]
labels = X.columns
y = data.recession
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)
X_shared = theano.shared(X_train.values)
with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               labels=labels,
               y=y_train,
               family=pm.glm.families.Binomial())
with logistic_model_pred:
    pred_trace = pm.sample(draws=10000,
                           tune=1000,
                           chains=2,
                           cores=1,
                           init='adapt_diag')
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [yield_curve, Intercept]
Sampling 2 chains for 1_000 tune and 10_000 draw iterations (2_000 + 20_000 draws total) took 15 seconds.
The acceptance probability does not match the target. It is 0.8816539713275935, but should be close to 0.8. Try to increase the number of tuning steps.
The number of effective samples is smaller than 25% for some parameters.
We then run the sampler as before and apply pm.sample_posterior_predictive to the resulting trace after replacing the training data with the test data:
X_shared.set_value(X_test)
ppc = pm.sample_posterior_predictive(pred_trace,
                                     model=logistic_model_pred,
                                     samples=100)
y_score = np.mean(ppc['y'], axis=0)
roc_auc_score(y_score=y_score, y_true=y_test)
0.8740506329113924
def invlogit(x):
    return np.exp(x) / (1 + np.exp(x))
x = X_test.yield_curve
fig, ax = plt.subplots(figsize=(14, 5))
β = stats.beta((ppc['y'] == 1).sum(axis=0), (ppc['y'] == 0).sum(axis=0))
# estimated probability
ax.scatter(x=x, y=β.mean())
# error bars on the estimate
plt.vlines(x, *β.interval(0.95))
# actual outcomes
ax.scatter(x=x, y=y_test, marker='x')
# True probabilities
x_ = np.linspace(x.min()*1.05, x.max()*1.05, num=100)
ax.plot(-x_, invlogit(x_), linestyle='-')
ax.set_xlabel('Yield Curve')
ax.set_ylabel('Recession')
ax.invert_xaxis()
fig.tight_layout();
The following code is based on an MCMC visualization tutorial.
# Number of MCMC iterations to animate.
burnin = 1000
samples = 1000
var1 = 'yield_curve'
var1_range = (trace_df[var1].min() * .95, trace_df[var1].max() * 1.05)
var2 = 'sentiment'
var2_range = (trace_df[var2].min() * .95, trace_df[var2].max() * 1.05)
Writer = animation.writers['ffmpeg']
writer = Writer(fps=15, metadata=dict(artist='Me'), bitrate=1800)
with pm.Model() as logistic_model:
    pm.glm.GLM.from_formula(formula=full_model,
                            data=data,
                            family=pm.glm.families.Binomial())
def init():
    for line in lines:
        line.set_data([], [])
    return lines
def animate(i):
    trace = trace_df.iloc[:i+1]
    idx = list(range(len(trace)))
    line1.set_data(trace[var1].iloc[::-1], idx)
    line2.set_data(idx, trace[var2].iloc[::-1])
    line3.set_data(trace[var1], trace[var2])
    line4.set_data(trace[var1], trace[var2])
    line5.set_data([trace[var1].iloc[-1], trace[var1].iloc[-1]], [trace[var2].iloc[-1], var2_range[1]])
    line6.set_data([trace[var1].iloc[-1], var1_range[1]], [trace[var2].iloc[-1], trace[var2].iloc[-1]])
    return lines
with logistic_model:
    nuts_trace = pm.sample(draws=samples, tune=burnin,
                           init='adapt_diag',
                           chains=1)
trace_df = pm.trace_to_dataframe(nuts_trace)
trace_df.to_csv('trace.csv', index=False)
trace_df = pd.read_csv('trace.csv')
print(trace_df.info())
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Sequential sampling (1 chains in 1 job)
NUTS: [sentiment, financial_conditions, leverage, yield_curve, Intercept]
Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 3 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Intercept             1000 non-null   float64
 1   yield_curve           1000 non-null   float64
 2   leverage              1000 non-null   float64
 3   financial_conditions  1000 non-null   float64
 4   sentiment             1000 non-null   float64
dtypes: float64(5)
memory usage: 39.2 KB
None
fig = plt.figure(figsize=(8, 8))
ax1 = fig.add_subplot(221, xlim=var1_range, ylim=(0, samples))
ax2 = fig.add_subplot(224, xlim=(0, samples), ylim=var2_range)
ax3 = fig.add_subplot(223, xlim=var1_range, ylim=var2_range,
                      xlabel=var1, ylabel=var2)
fig.subplots_adjust(wspace=0.0, hspace=0.0)
line1, = ax1.plot([], [], lw=1)
line2, = ax2.plot([], [], lw=1)
line3, = ax3.plot([], [], 'o', lw=2, alpha=.1)
line4, = ax3.plot([], [], lw=1, alpha=.3)
line5, = ax3.plot([], [], 'k', lw=1)
line6, = ax3.plot([], [], 'k', lw=1)
ax1.set_xticklabels([])
ax2.set_yticklabels([])
lines = [line1, line2, line3, line4, line5, line6]
anim = animation.FuncAnimation(fig,
animate,
init_func=init,
frames=samples,
interval=5,
blit=True);
# save
# anim.save('nuts.mp4', writer=writer)
# or display; either requires ffmpeg installation
HTML(anim.to_html5_video())
with logistic_model:
    step = pm.Metropolis()
    mh_trace = pm.sample(draws=samples, tune=burnin,
                         step=step,
                         chains=1)
trace_df = pm.trace_to_dataframe(mh_trace)
Sequential sampling (1 chains in 1 job)
CompoundStep
>Metropolis: [sentiment]
>Metropolis: [financial_conditions]
>Metropolis: [leverage]
>Metropolis: [yield_curve]
>Metropolis: [Intercept]
Sampling 1 chain for 1_000 tune and 1_000 draw iterations (1_000 + 1_000 draws total) took 1 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks
fig = plt.figure(figsize=(8, 8))
ax1 = fig.add_subplot(221, xlim=var1_range, ylim=(0, samples))
ax2 = fig.add_subplot(224, xlim=(0, samples), ylim=var2_range)
ax3 = fig.add_subplot(223, xlim=var1_range, ylim=var2_range,
                      xlabel=var1, ylabel=var2)
fig.subplots_adjust(wspace=0.0, hspace=0.0)
line1, = ax1.plot([], [], lw=1)
line2, = ax2.plot([], [], lw=1)
line3, = ax3.plot([], [], 'o', lw=2, alpha=.1)
line4, = ax3.plot([], [], lw=1, alpha=.3)
line5, = ax3.plot([], [], 'k', lw=1)
line6, = ax3.plot([], [], 'k', lw=1)
ax1.set_xticklabels([])
ax2.set_yticklabels([])
lines = [line1, line2, line3, line4, line5, line6]
def init():
    for line in lines:
        line.set_data([], [])
    return lines
def animate(i):
    trace = trace_df.iloc[:i+1]
    idx = list(range(len(trace)))
    line1.set_data(trace[var1].iloc[::-1], idx)
    line2.set_data(idx, trace[var2].iloc[::-1])
    line3.set_data(trace[var1], trace[var2])
    line4.set_data(trace[var1], trace[var2])
    line5.set_data([trace[var1].iloc[-1], trace[var1].iloc[-1]], [trace[var2].iloc[-1], var2_range[1]])
    line6.set_data([trace[var1].iloc[-1], var1_range[1]], [trace[var2].iloc[-1], trace[var2].iloc[-1]])
    return lines
anim = animation.FuncAnimation(fig,
animate,
init_func=init,
frames=samples,
interval=5,
blit=True)
HTML(anim.to_html5_video())